# from urllib.robotparser import RobotFileParser
#
# rp = RobotFileParser()
# rp.set_url('https://www.baidu.com/robots.txt')
# # read() fetches and parses robots.txt; if you skip this call, every later can_fetch check returns False
# rp.read()
# # can_fetch(useragent, url) returns True or False, indicating whether the crawler identified by useragent is allowed to fetch that URL
# print(rp.can_fetch('Baiduspider', 'https://www.baidu.com'))
# print(rp.can_fetch('Baiduspider', 'https://www.baidu.com/homepage/'))
# print(rp.can_fetch('Googlebot', 'https://www.baidu.com/homepage/'))

# Alternatively, fetch robots.txt yourself and feed its lines to parse()
from urllib.request import urlopen
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# Download robots.txt, decode it, and hand the individual lines to the parser
lines = urlopen('https://www.baidu.com/robots.txt').read().decode('utf-8').splitlines()
rp.parse(lines)
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com'))
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com/homepage/'))
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/homepage/'))
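
# A minimal optional sketch: besides can_fetch, the parser also exposes
# crawl_delay() and request_rate() (Python 3.6+), which report the Crawl-delay
# and Request-rate directives for a given user agent, or None when the fetched
# robots.txt does not declare them (as is likely the case here).
print(rp.crawl_delay('Baiduspider'))
print(rp.request_rate('Baiduspider'))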

